/*THIS IS THE CBP_US_READER.DO FILE OF ACEMOGLU, AUTOR, DORN, HANSON, AND PRICE (JLE, 2016)
DOWNLOADED FROM https://www.ddorn.net/data/AADHP-GreatSag-FileArchive.zip
MAKES EXTENSIVE USE OF CROSSWALKS FILES IN XWALKS FOLDER (ALSO FROM AADHP)
CHANGES: DIRECTORIES, EXTENDED TO 2016
*/

/*
	Description: Cleans the national-level CBP data files
	
	Author: Brendan Price
	Updated: July 29, 2014
*/


include directories_build.do

/* Script for allocating aggregate NAICS industries to subordinate 6-digit industries */

capture program drop allocate_naics
program define allocate_naics
	foreach v in "emp" "qp1" "ap" "estab" {
		* Allocate aggregate industries recursively
		foreach i of numlist 5 4 3 2 {
			* Compute quantity of employment/payroll to be allocated
			bysort naics_`i'd: egen alloc_`v' = total(`v' * (agg_level == `i')) if naics_`i'd != ""
	
			* Compute total employment/payroll in subordinate 6-digit industries
			bysort naics_`i'd: egen level_`v' = total(`v' * (agg_level == 6)) if naics_`i'd != ""
	
			* Verify that recipient industries have positive employment/payroll.
			assert level_`v' > 0 & level_`v' < . if alloc_`v' > 0 & alloc_`v' < .

			* Compute each 6-digit industry's share of employment/payroll in the superordinate industry
			gen level_share = `v'/level_`v' if agg_level == 6
	
			* Allocate employment/payroll on the basis of shares
			replace `v' = `v' + (alloc_`v' * level_share) if agg_level == 6
			
			* Zero out the industries that were just allocated
			replace `v' = 0 if agg_level == `i'
		
			* Cleanup
			drop alloc_`v' level_`v' level_share
		}

		assert agg_level == 6 if `v' > 0
	}
	
	keep if agg_level == 6
end

/* Process raw national-level CBP data files */

foreach year in "1990" "1991" "1992" "1993" "1994" "1995" "1996" "1997" "1998" "1999" "2000" "2001" "2002" "2003" "2004" "2005" "2006" "2007" "2008" "2009" "2010" "2011" "2012" "2013" "2014" "2015" "2016" {

	* Extract 2-digit year
	local y = substr("`year'", 3, 2)

	* Load source data at the national level
	insheet using "`rawdir'`s'cbp`y'us.txt", comma clear

	* Mark year
	gen year = `year'
	
	* Verify that US code is vacuous
	assert uscode == 98
	drop uscode
		
	/*
		Integrity check industry codes:
		In the CBP observations are at different levels of aggregation. This has to be dealt with carefully.
	*/

	* Specify industry variable: SIC codes used through 1997, NAICS thereafter
	if `year' <= 1997 {
		local indvar "sic"
	}
	else {
		local indvar "naics"
	}

	* From 2008 onwards, CBP breaks everything down by establishment type; we just need aggregate
	if `year' >= 2008 {
		keep if lfo == "-"
		drop lfo
	}
	
	* Verify that industry is a unique identifier
	bysort `indvar': assert _N == 1

	* Process codes at different levels of aggregation
	if "`indvar'" == "sic" {
		* Verify that SIC codes are at the 4-digit level
		assert length(sic) == 4

		* Merge in details on the SIC hierarchy and verify all codes are accounted for
		merge 1:1 sic using "`nestingdir'`s'sic87_nesting.dta", assert(2 3)
		keep if _merge == 3
		drop _merge
	}
	else {
		* Verify that NAICS codes are at the 6-digit level
		assert length(naics) == 6

		* Merge in NAICS hierarchies and verify all codes are accounted for
		if year >= 1998 & year <= 2002 {
			merge m:1 naics using "`nestingdir'`s'naics97_nesting.dta", assert(2 3)
			keep if _merge == 3
			drop _merge
		}
		else if year >= 2003 & year <= 2007 {
			merge m:1 naics using "`nestingdir'`s'naics02_nesting.dta", assert(2 3)
			keep if _merge == 3
			drop _merge
		}
		else if year >= 2008 & year <= 2011 {
			merge m:1 naics using "`nestingdir'`s'naics07_nesting.dta", assert(2 3)
			keep if _merge == 3
			drop _merge
		}
		else if year >= 2012 & year <= 2016 {
			merge m:1 naics using "`nestingdir'`s'naics12_nesting.dta", assert(2 3)
			keep if _merge == 3
			drop _merge
		}
	}
	
	/*
		Process employment and payroll variables.
		Industry employment counts and payrolls are broken out by establishment size.
		I run integrity checks to make sure these data are internally consistent.
		I also impute missing employment counts using industries with non-missing values.
	*/

	* Total number of establishments should equal sum of establishment classes
	assert est != .
	assert est == n1_4 + n5_9 + n10_19 + n20_49 + n50_99 + n100_249 + n250_499 + n500_999 + n1000
	rename est estab

	* Loop over establishment size
	foreach r in "1_4" "5_9" "10_19" "20_49" "50_99" "100_249" "250_499" "500_999" "1000" {
		* Extract end points of the interval; note that range 1-4 is really like 0-4 in the data
		if "`r'" == "1_4" {
			local r_low = 0
			local r_high = 4
		}
		else if "`r'" == "1000" {
			local r_low = 1000
			local r_high = .
		}
		else {
			tokenize "`r'", parse("_")
			local r_low = `1'
			local r_high = `3'
		}
	
		* Missing-value flags should assume only two values prior to 2007
		if `year' <= 2006 {
			assert f`r' == "" | f`r' == "D"

			* Convert flags to dummies
			gen imp_`r' = (f`r' == "D")
		}
		else {
			* Convert flags to dummies: I discard some information here for consistent treatment across years
			gen imp_`r' = (f`r' != "")
		}
		drop f`r'
		
		* Employment should equal zero if flagged or if no establishments in category
		assert e`r' == 0 if imp_`r' == 1 | n`r' == 0

		* Payroll should equal zero if no establishments in category or (prior to 2007) if observation flagged
		if `year' <= 2006 {
			assert q`r' == 0 if imp_`r' == 1 | n`r' == 0
			assert a`r' == 0 if imp_`r' == 1 | n`r' == 0
		}
		else {
			assert q`r' == 0 if n`r' == 0
			assert a`r' == 0 if n`r' == 0
		}
		
		/* Impute values in suppressed cells */
		
		assert n`r' >= 0 & n`r' < .
		assert n`r' > 0 if agg_level == 0

		* First impute the number of employees in each establishment size class
		egen avg_e_`r' = total((e`r'/n`r') * (agg_level == 0))
		assert avg_e_`r' >= `r_low' & avg_e_`r' <= `r_high'

		* Impute employment whenever (1) the observation is flagged and (2) the value if missing; (1) does not always imply (2)
		replace e`r' = (avg_e_`r' * n`r') if imp_`r' == 1 & e`r' == 0
		assert e`r' >= 0 & e`r' < .
		drop avg_e_`r'

			
		* Next impute payroll variables for each establishment size class
		foreach v in "q" "a" {
			* Compute average wage in establishment class in aggregate
			egen avg_`v'_`r' = total((`v'`r'/e`r') * (agg_level == 0))
			assert avg_`v'_`r' > 0 & avg_`v'_`r' < .
			
			* Impute whenever (1) the observation is flagged and (2) the value if missing; (1) does not always imply (2)
			replace `v'`r' = (avg_`v'_`r' * e`r') if imp_`r' == 1 & `v'`r' == 0
			assert `v'`r' >= 0 & `v'`r' < .
			drop avg_`v'_`r'
			
		}
	}

	* Sum employment/payroll across establishment sizes to get a second measure of employment in each industry
	gen est_emp = e1_4 + e5_9 + e10_19 + e20_49 + e50_99 + e100_249 + e250_499 + e500_999 + e1000
	gen est_qp1 = q1_4 + q5_9 + q10_19 + q20_49 + q50_99 + q100_249 + q250_499 + q500_999 + q1000
	gen est_ap = a1_4 + a5_9 + a10_19 + a20_49 + a50_99 + a100_249 + a250_499 + a500_999 + a1000
	
	* Establishment-size variables no longer needed: later we may want to incorporate them
	drop e1_4 e5_9 e10_19 e20_49 e50_99 e100_249 e250_499 e500_999 e1000
	drop q1_4 q5_9 q10_19 q20_49 q50_99 q100_249 q250_499 q500_999 q1000
	drop a1_4 a5_9 a10_19 a20_49 a50_99 a100_249 a250_499 a500_999 a1000
	drop n1_4 n5_9 n10_19 n20_49 n50_99 n100_249 n250_499 n500_999 n1000
	drop imp_1_4 imp_5_9 imp_10_19 imp_20_49 imp_50_99 imp_100_249 imp_250_499 imp_500_999 imp_1000
	
	/* Impute missing employment counts using quantities estimated above */
	
	* Handle employment flags: variable may be either string or numeric (note: same flag applies for payroll)
	local empflag_type : type empflag
	
	
	if regexm("`empflag_type'", "str") == 0 {
		* empflag should be numeric only if it is uniformly missing
		assert empflag == .
		tostring empflag, replace
		replace empflag = "" if empflag == "."
	}

	* Total industry employment should equal zero iff flagged
	assert emp == 0 if empflag != ""
	assert emp > 0 if empflag == ""

	* Flag for whether I impute total employment/payroll
	gen emp_imputed = (empflag != "")

	/* Exploit information in empflag to correct estimated employment counts */
	
	* Tag industries for which I modify estimated employment count used in imputation
	gen est_emp_modified = 0
	
	foreach c in "A" "B" "C" "E" "F" "G" "H" "I" "J" "K" "L" "M" {
		* Stipulate employment intervals represented by each flag
		if "`c'" == "A" local range "0-19"
		if "`c'" == "B" local range "20-99"
		if "`c'" == "C" local range "100-249"
		if "`c'" == "E" local range "250-499"
		if "`c'" == "F" local range "500-999"
		if "`c'" == "G" local range "1000-2499"
		if "`c'" == "H" local range "2500-4999"
		if "`c'" == "I" local range "5000-9999"
		if "`c'" == "J" local range "10000-24999"
		if "`c'" == "K" local range "25000-49999"
		if "`c'" == "L" local range "50000-99999"
		if "`c'" == "M" local range "100000-."
		
		* Extract low and high end of each range
		local rmin = substr("`range'", 1, strpos("`range'", "-") - 1)
		local rmax = substr("`range'", strpos("`range'", "-") + 1, .)

		* Force estimated employment to fit within the indicated interval
		replace est_emp_modified = 1 if empflag == "`c'" & (est_emp < `rmin' | est_emp > `rmax')
		replace est_emp = `rmin' if empflag == "`c'" & est_emp < `rmin'
		replace est_emp = `rmax' if empflag == "`c'" & est_emp > `rmax'
	}

	* Impute industry employment as sum of (possibly imputed) employment values across classes
	replace emp = est_emp if empflag != ""

	/* Impute missing payroll values */
	
	gen qp1_imputed = 0
	gen ap_imputed = 0
	
	if `year' <= 2006 {
		* In these years, total payroll should equal zero iff flagged
		assert qp1 == 0 if empflag != ""
		assert qp1 > 0 if empflag == ""
		
		assert ap == 0 if empflag != ""
		assert ap > 0 if empflag == ""

		* Flag industries for which we impute payroll
		replace qp1_imputed = 1 if (empflag != "" & qp1 == 0)
		replace ap_imputed = 1 if (empflag != "" & ap == 0)

		* Impute missing payroll numbers
		replace qp1 = est_qp1 if (empflag != "" & qp1 == 0)
		replace ap = est_ap if (empflag != "" & ap == 0)
	}
	else {
		* Flag industries for which we impute payroll
		replace qp1_imputed = 1 if (empflag != "" | qp1_nf == "S") & qp1 == 0
		replace ap_imputed = 1 if (empflag != "" | ap_nf == "S") & ap == 0

		* Starting in 2007, payroll variables are sometimes suppressed due to poor data quality.
		* I impute in these cases as well.
		replace qp1 = est_qp1 if (empflag != "" | qp1_nf == "S") & qp1 == 0
		replace ap = est_ap if (empflag != "" | ap_nf == "S") & ap == 0
	}

	* empflag and estimated totals are no longer needed
	drop empflag qp1_imputed ap_imputed est_emp est_qp1 est_ap

	* Verify that employment, payroll, and number of establishments are positive in all industries
	assert emp > 0 & emp < .
	assert qp1 > 0 & qp1 < .
	assert ap > 0 & ap < .
	assert estab > 0 & estab < .

	
	/*
		Handle auxiliary and unclassified industries:
		(1) record total employment in auxiliary and unclassified industries
		(2) deduct auxiliary industries from the appropriate sectoral/aggregate totals
		(3) drop auxiliaries and unclassifieds

		Unclassified industries are easily deleted in all CBP extracts, but treatment of auxiliaries varies over time.
		In the 1987 SIC codes, each major sector (mining, manufacturing, etc.) has its own auxiliary code.
		In the 1997 NAICS, auxiliaries are classified into codes 95---- (auxiliaries) and 55---- (a management code).
		In the 2002 NAICS, code 55---- is used as before, but it appears that some industries formerly placed in 95---- are scattered throughout.
				
		The NAICS crosswalks map certain industry codes into "miscellaneous" codes like 20001, 30001, etc.
		These codes appear to correspond to employment coded as "auxiliary" in files coded using 1987 SIC codes.
		
		My procedure is therefore:
		- for files coded using 1987 SIC, drop the sector-specific auxiliary industries
		- for files coded using NAICS codes, drop industry 95---- as well as misc. codes 20001, 30001, etc.

		I do not explicitly drop code 55----, but some establishments in this category are mapped into 20001, etc. anyway.
		They are therefore dropped when we exclude these other auxiliaries.
		My chief rationale for not explicitly dropping 55---- is that NAICS code 551111 is mapped into SIC code 6710.
		Retaining NAICS code 551111 results in a fairly smooth series for SIC 6710, suggesting that 551111 does not correspond to establishments that would be treated as auxiliaries in the SIC files.
	*/
	
	if "`indvar'" == "sic" {
		* Deduct auxiliary counts from sectoral totals; otherwise auxiliary employment will be allocated
		foreach v in "emp" "qp1" "ap" "estab" {
			bysort sic87_1d: egen sic87_1d_`v'_aux = total(`v' * (auxiliary == 1))
			replace `v' = max(0, `v' - sic87_1d_`v'_aux) if agg_level == 1
			replace `v' = 0 if auxiliary == 1
			drop sic87_1d_`v'_aux
		}
				
		* Auxiliaries and unclassifieds are no longer needed
		drop if auxiliary == 1 | sic == "99--"
		drop auxiliary
	}
	else {
		* Auxiliaries and unclassifieds are no longer needed
		drop if naics == "95----" | naics == "99----"
		assert naics_2d != "20" & naics_2d != "21"
	}

	/*
		"Residualize" SIC industries at each level of aggregation.
		
		First, turn any observations at the 3-digit level into "residual industries,"
		i.e. fictitious industries corresponding to any nested 4-digit industries that do not appear in the data.
		Then do the same thing for observations that live at the 2-digit level, and again for the 1-digit level.
	*/

	if "`indvar'" == "sic" {
		* Loop over employment/payroll variables
		foreach v in "emp" "qp1" "ap" "estab" {
			foreach i of numlist 3 2 1 {
				local j = `i' + 1

				* Compute employment/payroll within each i-digit industry's j-digit constituents
				bysort sic87_`i'd: egen sic87_`i'd_`v' = total(`v' * (agg_level >= `j'))
				
				* Turn observations at the i-digit level into residual industries (zero out if the i-digit industry is too small)
				replace `v' = max(0, `v' - sic87_`i'd_`v') if agg_level == `i'
				drop sic87_`i'd_`v'
			}
		}

		* Specify hierarchy codes to be retained
		local nests "sic87_0d sic87_1d sic87_2d sic87_3d sic87_4d"
	}
	
	/*
		"Residualize" NAICS industries at each level of aggregation.
		
		First, turn any observations at the 5-digit level into "residual industries,"
		i.e. fictitious industries corresponding to any nested 6-digit industries that do not appear in the data.
		Then do the same thing for observations that live at the 4-digit level, then 3-digit and 2-digit levels.
	*/
	
	if "`indvar'" == "naics" {
		foreach v in "emp" "qp1" "ap" "estab" {
			foreach i of numlist 5 4 3 2 {
				local j = `i' + 1
						
				* Compute employment/payroll within each i-digit industry's j-digit constituents
				bysort naics_`i'd: egen naics_`i'd_`v' = total(`v' * (agg_level >= `j'))
							
				* Turn observations at the i-digit level into residual industries (zero out if the i-digit industry is too small)
				replace `v' = max(0, `v' - naics_`i'd_`v') if agg_level == `i'
				drop naics_`i'd_`v'
			}
		}
		
		* Specify hierarchy codes to be retained
		local nests "naics_0d naics_2d naics_3d naics_4d naics_5d naics_6d"
	}

	* Restrict to variables of interest
	keep year `indvar' emp qp1 ap estab agg_level `nests'

	* Express payroll variables in dollars (rather than $1000s)
	replace qp1 = qp1 * 1000
	replace ap = ap * 1000

	* Drop observation corresponding to aggregate employment
	drop if agg_level == 0

	/* Map 2012 NAICS codes into 2007 NAICS codes */
	if `year' >= 2012 & `year' <= 2016 {

		* Before mapping 2012 NAICS into 2007 NAICS, first need to allocate all aggregate industries
		allocate_naics
		assert agg_level == 6
		
		* Observations at the 6-digit level are purely numeric
		destring naics, replace

		* Merge in 2007 NAICS codes for codes that changed; all other codes remain unaffected
		rename naics naics12
		joinby naics12 using "`inddir'`s'naics`s'naics12_naics07.dta", unmatched(master)
		replace naics07 = naics12 if _merge == 1
		replace weight = 1 if _merge == 1

		* Collapse to the level of 2007 NAICS
		replace emp = emp * weight
		replace qp1 = qp1 * weight
		replace ap = ap * weight
		replace estab = estab * weight

		collapse (sum) emp (sum) qp1 (sum) ap (sum) estab, by(year naics07)
		rename naics07 naics

		tostring naics, replace
		
		* Recollapse to avoid duplicate industry codes
		collapse (sum) emp (sum) qp1 (sum) ap (sum) estab, by(year naics)

		* Merge in the 2007 NAICS hierarchy
		merge m:1 naics using "`nestingdir'`s'naics07_nesting.dta"
		keep if _merge == 3
		drop _merge	
	}
	
	/* Map 2007 NAICS codes into 2002 NAICS codes */

	if `year' >= 2008 & `year' <= 2016 {
		* Before mapping 2007 NAICS into 2002 NAICS, first need to allocate all aggregate industries
		allocate_naics
		assert agg_level == 6
		
		* Observations at the 6-digit level are purely numeric
		destring naics, replace

		* Merge in 2002 NAICS codes for codes that changed; all other codes remain unaffected
		rename naics naics07
		joinby naics07 using "`inddir'`s'naics`s'naics07_naics02.dta", unmatched(master)
		replace naics02 = naics07 if _merge == 1
		replace weight = 1 if _merge == 1

		* Collapse to the level of 2002 NAICS
		replace emp = emp * weight
		replace qp1 = qp1 * weight
		replace ap = ap * weight
		replace estab = estab * weight

		collapse (sum) emp (sum) qp1 (sum) ap (sum) estab, by(year naics02)
		rename naics02 naics
		tostring naics, replace
		
		* Recollapse to avoid duplicate industry codes
		collapse (sum) emp (sum) qp1 (sum) ap (sum) estab, by(year naics)

		* Merge in the 2002 NAICS hierarchy
		merge m:1 naics using "`nestingdir'`s'naics02_nesting.dta", assert(2 3)
		keep if _merge == 3
		drop _merge
	}

	/* Map 2002 NAICS codes into 1997 NAICS codes. */
	
	if `year' >= 2003 & `year' <= 2014 {
		* Before mapping 2002 NAICS into 1997 NAICS, first need to allocate all aggregate industries
		allocate_naics
		assert agg_level == 6

		* Observations at the 6-digit level are purely numeric
		destring naics, replace
		
		* Modify industry codes as needed to ensure proper mapping to 1997 codes
		replace naics = 425100 if naics == 425110	/* Business to business electronic markets */
		replace naics = 425100 if naics == 425120	/* Wholesale trade agents and brokers */
		collapse (sum) emp qp1 ap estab, by(year naics)

		* Merge in 1997 NAICS codes for codes that changed; all other codes remain unaffected
		rename naics naics02
		joinby naics02 using "`inddir'`s'naics`s'naics02_naics97.dta", unmatched(master)
		replace naics97 = naics02 if _merge == 1
		replace weight = 1 if _merge == 1

		* Weights should sum to unity within each county x industry, up to floating point error
		bysort naics02: egen tot_weight = total(weight)
		assert abs(1 - tot_weight) < .001

		* Collapse to the level of 1997 NAICS
		replace emp = emp * weight
		replace qp1 = qp1 * weight
		replace ap = ap * weight
		replace estab = estab * weight

		collapse (sum) emp qp1 ap estab, by(year naics97)
		rename naics97 naics
		tostring naics, replace
		
		* Modify industry codes that aren't properly handled by the official Census crosswalk
		replace naics = "233110" if naics == "237210"	/* Land subdivision */
		replace naics = "235920" if naics == "238150"	/* Glass and glazing contractors */
		replace naics = "235520" if naics == "238330"	/* Flooring contractors */
		replace naics = "235430" if naics == "238340"	/* Tile and Terrazzo Contractors */
		replace naics = "454110" if naics == "454112"	/* Electronic auctions */

		* Recollapse to avoid duplicate industry codes
		collapse (sum) emp (sum) qp1 (sum) ap (sum) estab, by(year naics)

		* Merge in the 1997 NAICS hierarchy
		merge m:1 naics using "`nestingdir'`s'naics97_nesting.dta", assert(2 3)
		keep if _merge == 3
		drop _merge
	}
	
	/* Map 1997 NAICS codes into 1987 SIC codes */

	if `year' >= 1998 & `year' <= 2016 {
		* Before mapping NAICS into SIC, first need to allocate all aggregate industries
		allocate_naics
		assert agg_level == 6

		* Observations at the 6-digit level are purely numeric
		destring naics, replace

		* Merge in SIC87 codes
		joinby naics using "`inddir'`s'naics97_sic87`s'naics97_sic87.dta", unmatched(both) _merge(naics_merge)
	
		* All NAICS industry codes should be accounted for
		assert naics_merge == 2 | naics_merge == 3
		keep if naics_merge == 3
		drop naics_merge
		
		* Weights should sum to unity within each industry, up to floating point error
		bysort naics: egen tot_weight = total(weight)
		assert abs(1 - tot_weight) < .001

		* Reweight all quantities	
		replace emp = emp * weight
		replace qp1 = qp1 * weight
		replace ap = ap * weight
		replace estab = estab * weight

		* Record quantity of labor in "allocated" auxiliary industries (i.e., outside the designated auxiliary codes)
		gen aux = 0
		foreach s in 10001 20001 30001 40001 50001 60001 70001 80001 90001 {
			replace aux = 1 if sic == `s'
		}
		
		egen tot_emp = total(emp)
		egen tot_emp_aux = total(emp * (aux == 1))
		gen aux_share = tot_emp_aux/tot_emp
	
		* Drop auxiliary industries
		drop if aux == 1
		
		* Collapse to the SIC87 level		
		collapse (sum) emp (sum) qp1 (sum) ap (sum) estab, by(year sic)
		
		* Convert back to string, since for years with native SIC codes, sic is a string at this point
		tostring sic, replace
		replace sic = "0" + sic if length(sic) == 3
		assert length(sic) == 4
		
		* Merge in the SIC hierarchy
		merge 1:1 sic using "`nestingdir'`s'sic87_nesting.dta", assert(2 3)
		keep if _merge == 3
		drop _merge

		* Exclude auxiliary employment as well as unclassified establishments
		drop if auxiliary == 1
		drop if sic == "99--"
	}
	
	/*
		Aggregate SIC industries as needed to ensure balanced representation.
		Most aggregations are performed by the script sic87_sic87dd.do.
		Here, I perform only those aggregations needed to ensure proper functioning of this cleaning code.
	*/
	
	* Combine "Farm labor contractors" with "Farm management services".
	* Need to do this because, starting in 1998, the codes subordinate to 0760 have no employment.
	replace sic = "0760" if sic == "0761" | sic == "0762"

	foreach v in "emp" "qp1" "ap" "estab" {
		egen `v'_sum = total(`v' * (sic == "0760"))
		replace `v' = `v'_sum if sic == "0760"
		drop `v'_sum
	}

	drop if sic == "0760" & sic87_4d != ""
	replace agg_level = 4 if sic == "0760"
	replace sic87_4d = "0760" if sic == "0760"
	quietly count if sic == "0760"
	assert r(N) == 1
	
	/* Allocate aggregate industries in the SIC datasets */

	if `year' >= 1988 & `year' <= 2016 {
		foreach v in "emp" "qp1" "ap" "estab" {
			* Allocate aggregate industries recursively
			foreach i of numlist 3 2 1 {
				* Compute quantity of the outcome to be allocated
				bysort sic87_`i'd: egen alloc_`v' = total(`v' * (agg_level == `i')) if sic87_`i'd != ""

				* Compute total value of the outcome in subordinate 4-digit industries
				bysort sic87_`i'd: egen level_`v' = total(`v' * (agg_level == 4)) if sic87_`i'd != ""
	
				* Verify that recipient industries have positive values of the outcome
				assert level_`v' > 0 & level_`v' < . if alloc_`v' > 0 & alloc_`v' < .
	
				* Compute each 4-digit industry's share of the outcome in the superordinate industry
				gen level_share = `v'/level_`v' if agg_level == 4
	
				* Allocate the outcome on the basis of shares
				replace `v' = `v' + (alloc_`v' * level_share) if agg_level == 4

				* Zero out the industries that were just allocated
				replace `v' = 0 if agg_level == `i'
	
				* Cleanup
				drop alloc_`v' level_`v' level_share
			}
			
			assert agg_level == 4 if `v' > 0
		}
	}

	keep if agg_level == 4

	/* Map 1987 SIC codes into sic87dd codes and collapse to aggregate combined observations */

	* Map to sic87dd codes
	destring sic, replace	
	rename sic sic87
	do "`inddir'`s'sic87_sic87dd`s'sic87_sic87dd.do"

	collapse (sum) emp (sum) qp1 (sum) ap (sum) estab, by(year sic87dd)	
	
	/* Aggregate non-manufacturing industries to a level that can be linked to the I/O matrix */

	gen manuf = (sic87dd >= 2000 & sic87dd <= 3999)
	merge m:1 sic87dd using "`inddir'`s'sic2io`s'sic87dd_to_sic87dd_pooled.dta", keepusing(sic87dd_pooled)
	assert _merge == 1 if manuf == 1
	assert _merge == 3 if manuf == 0
	drop _merge

	replace sic87dd = sic87dd_pooled if manuf == 0
	bysort sic87dd (manuf): assert manuf[1] == manuf[_N]
	drop sic87dd_pooled

	collapse (sum) emp (sum) qp1 (sum) ap (sum) estab, by(year sic87dd)

	* Save the clean dataset
	rename qp1 nom_qp1
	rename ap nom_ap
	save "`cbpdir'`s'cbp`y'us.dta", replace
}

/* Combine cleaned CBP extracts into a single file */

use "`cbpdir'`s'cbp90us.dta", clear
append using "`cbpdir'`s'cbp91us.dta"
append using "`cbpdir'`s'cbp92us.dta"
append using "`cbpdir'`s'cbp93us.dta"
append using "`cbpdir'`s'cbp94us.dta"
append using "`cbpdir'`s'cbp95us.dta"
append using "`cbpdir'`s'cbp96us.dta"
append using "`cbpdir'`s'cbp97us.dta"
append using "`cbpdir'`s'cbp98us.dta"
append using "`cbpdir'`s'cbp99us.dta"
append using "`cbpdir'`s'cbp00us.dta"
append using "`cbpdir'`s'cbp01us.dta"
append using "`cbpdir'`s'cbp02us.dta"
append using "`cbpdir'`s'cbp03us.dta"
append using "`cbpdir'`s'cbp04us.dta"
append using "`cbpdir'`s'cbp05us.dta"
append using "`cbpdir'`s'cbp06us.dta"
append using "`cbpdir'`s'cbp07us.dta"
append using "`cbpdir'`s'cbp08us.dta"
append using "`cbpdir'`s'cbp09us.dta"
append using "`cbpdir'`s'cbp10us.dta"
append using "`cbpdir'`s'cbp11us.dta"
append using "`cbpdir'`s'cbp12us.dta"
append using "`cbpdir'`s'cbp13us.dta"
append using "`cbpdir'`s'cbp15us.dta"
append using "`cbpdir'`s'cbp16us.dta"

compress
save "`cbpdir'`s'cbp_national_1990_2016.dta", replace


forvalues year = 1990/2016 {
	local yy = substr("`year'", 3, 2)
	rm "`cbpdir'`s'cbp`yy'us.dta"
}

